import shutil
import tempfile
import pybedtools
from numpy import *
from pylab import *
from scipy.stats import nbinom, pearsonr

import warnings
# Escalate every warning to an exception, so a warning raised anywhere below
# stops the script instead of being printed and ignored.
warnings.filterwarnings('error')

from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence

# Module-wide random number generator with a fixed seed, so that repeated runs
# draw the same pseudo-random sequence.  generate_counts() draws from it.
rs = RandomState(MT19937(SeedSequence(12345678)))

def read_expression(filename="peaks.MiSeq_HiSeq.expression.txt", n_miseq=17):
    """Read the peak expression table and summarize the library depths.

    The file is whitespace-delimited.  Its header line starts with the word
    ``peak`` followed by one name per library: the first `n_miseq` names must
    start with ``MiSeq`` and all remaining names with ``HiSeq``.  Each
    following line holds a peak identifier and one integer count per library.

    Parameters
    ----------
    filename : str, optional
        Path of the table to read.  Defaults to the file this script was
        written for.
    n_miseq : int, optional
        Number of leading library columns that belong to MiSeq; all later
        columns are treated as HiSeq.  Defaults to 17.

    Returns
    -------
    mean_total_miseq : float
        Total count per library, averaged over the MiSeq libraries.
    mean_total_hiseq : float
        Total count per library, averaged over the HiSeq libraries.
    fractions : ndarray
        One value per peak: the peak's share of a library's total count,
        averaged over the HiSeq libraries only.

    Raises
    ------
    AssertionError
        If the header does not start with ``peak`` or the library names do
        not follow the MiSeq-then-HiSeq layout.
    """
    print("Reading", filename)
    # The context manager closes the file even when a header check or the
    # parsing of a row raises part-way through.
    with open(filename) as stream:
        words = next(stream).split()
        libraries = words[1:]
        assert words[0] == 'peak', "first header column must be 'peak'"
        # Keep these as lists: `all` may be numpy's version here (star
        # import), which does not evaluate generator expressions.
        assert all([library.startswith("MiSeq")
                    for library in libraries[:n_miseq]]), \
            "leading library columns must be MiSeq libraries"
        assert all([library.startswith("HiSeq")
                    for library in libraries[n_miseq:]]), \
            "trailing library columns must be HiSeq libraries"
        # Drop the peak identifier (first column) and parse the counts.
        counts = array([array(line.split()[1:], int) for line in stream])
    # Column sums: total number of counts in each library.
    total = sum(counts, 0)
    mean_total_miseq = mean(total[:n_miseq])
    mean_total_hiseq = mean(total[n_miseq:])
    # Normalize each library to fractions of its own total, then average the
    # HiSeq columns to get one expected fraction per peak.
    fractions = counts / total
    fractions = mean(fractions[:, n_miseq:], 1)
    print("Mean number of counts, MiSeq libraries: %d" % mean_total_miseq)
    print("Mean number of counts, HiSeq libraries: %d" % mean_total_hiseq)
    return mean_total_miseq, mean_total_hiseq, fractions


def generate_counts(fractions, total, dispersion, random_state=None):
    """Simulate one negative-binomial read count per peak.

    For a peak with expected fraction ``f`` the count is drawn from
    ``scipy.stats.nbinom`` with ``n = dispersion`` and
    ``p = n / (f * total + n)``; under SciPy's parameterization the mean
    ``n * (1 - p) / p`` of that distribution equals ``f * total``.

    Parameters
    ----------
    fractions : sequence of float
        Expected fraction of the library's counts for each peak.
    total : float
        Total number of counts (sequencing depth) of the simulated library.
    dispersion : float
        The ``n`` parameter of the negative binomial distribution.
    random_state : optional
        Random state handed to ``nbinom.rvs``.  When omitted, the
        module-level generator ``rs`` is used, as before.

    Returns
    -------
    ndarray
        Float array with one simulated count per entry of `fractions`.
    """
    if random_state is None:
        random_state = rs
    counts = zeros(len(fractions))
    n = dispersion
    for i, fraction in enumerate(fractions):
        p = n / (fraction * total + n)
        # rvs(size=1) returns a length-1 array.  Take its single element
        # explicitly: storing a 1-element array into a scalar slot is
        # deprecated since NumPy 1.25 and the resulting warning is fatal
        # when warnings are escalated to errors.
        counts[i] = nbinom.rvs(n, p, size=1, random_state=random_state)[0]
    return counts

# Observed mean library depths and the expected per-peak fractions.
total_miseq, total_hiseq, fractions = read_expression()

dispersion = 1.75

# Comparison 1: one library simulated at HiSeq depth against one simulated at
# MiSeq depth.  The HiSeq library is drawn first; the order of the draws
# determines which random numbers each library receives.
counts_hiseq = generate_counts(fractions=fractions, total=total_hiseq, dispersion=dispersion)
counts_miseq = generate_counts(fractions=fractions, total=total_miseq, dispersion=dispersion)
message = "Correlation of raw data, MiSeq read depth vs HiSeq read depth: %.2f"
correlation, pvalue = pearsonr(counts_miseq, counts_hiseq)
print(message % correlation)
message = "Correlation of log-transformed data, MiSeq read depth vs HiSeq read depth: %.2f"
correlation, pvalue = pearsonr(log(1 + counts_miseq), log(1 + counts_hiseq))
print(message % correlation)

# Comparison 2: two independent libraries, both simulated at HiSeq depth.
# Note that counts_miseq is reused as the name of the second HiSeq-depth
# library here.
counts_hiseq = generate_counts(fractions=fractions, total=total_hiseq, dispersion=dispersion)
counts_miseq = generate_counts(fractions=fractions, total=total_hiseq, dispersion=dispersion)
message = "Correlation of raw data, HiSeq read depth vs HiSeq read depth: %.2f"
correlation, pvalue = pearsonr(counts_miseq, counts_hiseq)
print(message % correlation)
message = "Correlation of log-transformed data, HiSeq read depth vs HiSeq read depth: %.2f"
correlation, pvalue = pearsonr(log(1 + counts_miseq), log(1 + counts_hiseq))
print(message % correlation)
